library(ggplot2)
library(dplyr)
library(MultiAssayExperiment)
library(limma)
library(EnhancedVolcano)
library(DT)
library(RColorBrewer)
library(readr)
library(ggVennDiagram)
Data Overview:
Load multiassay .rds file, extract clinical, expression and annotations data; prepare gene expression data for analysis.
# Load your multiassay result and extract clinical data , expression data and annotation
#load mae obj
mae <- readRDS("~/BHK lab/Ravi_Testing/output/ICB_Ravi.rds")
#extract Clinical data
clin <- data.frame(colData(mae))
#extract the expression data
expr <- assays(mae)[["expr"]]
#extracting the annotation
annot <- data.frame(rowData(mae@ExperimentList$expr))
# Display first few rows of the dataset
DT::datatable(expr[1:8, 1:4])
Objective: Perform PCA on gene expression data to assess variation across different research centers.
Data Preparation: Following the RNA-seq differential expression analysis section from a paper, the analysis of differentially expressed genes involved filtering for protein-coding transcripts with a minimum expression of log2TPM ≥ 0.5 in at least 30% of samples.
Therefore, steps for PCA-ready gene expression data include:
log2(TPM+0.001) to
log2(TPM+1) for consistency.log2TPM ≥ 0.5 across at least 30% of samples.# Step 1: Restrict to Protein-Coding Genes
# Filter out non-protein-coding transcripts for focused analysis
annot_proteincoding <- annot[annot$gene_type == "protein_coding",] # 19158 protein coding genes.
expr <- expr[rownames(expr) %in% rownames(annot_proteincoding),]
# Step 2: Normalize Expression Data
# Convert from expr (log2(TPM + 0.001)) to standard log2(TPM + 1) format
expr <- log2((2**expr - 0.001) + 1)
# Step 3: Filter Low/Zero Expression Genes
# threshold for expr >= 0.5 per transcript
thresh <- expr >= 0.5
# Identify and keep transcripts with sufficient expression levels
keep <- rowSums(thresh) >= (0.3 * ncol(expr))
expr <- expr[keep,] # Dimensions: 15,857 x 148 - including 15k protein-coding
# Display expr
DT::datatable(expr[1:8, 1:4])
#1. PCA
# Transpose expr data for PCA , samples as rows and genes as columns.
expr_t <- t(expr) #samples as rows and genes as columns
# Calculate PCA
pc <- prcomp(expr_t, center = TRUE, scale. = TRUE)
# Calculate and print the percentage of variance explained by each principal component
var_res <- pc$sdev^2 / sum(pc$sdev^2) * 100
var_res <- round(var_res, 2) # Round to 2 decimal places
print(var_res)
## [1] 28.19 5.71 4.15 3.85 3.35 2.99 2.63 2.24 2.10 2.06 1.97 1.73
## [13] 1.67 1.64 1.44 1.40 1.29 1.18 1.07 1.01 0.95 0.89 0.81 0.78
## [25] 0.72 0.66 0.64 0.63 0.59 0.57 0.54 0.52 0.50 0.46 0.46 0.45
## [37] 0.43 0.41 0.41 0.39 0.38 0.37 0.36 0.36 0.34 0.33 0.32 0.31
## [49] 0.31 0.30 0.29 0.29 0.29 0.28 0.27 0.26 0.26 0.25 0.25 0.25
## [61] 0.24 0.24 0.23 0.23 0.22 0.22 0.22 0.21 0.21 0.21 0.20 0.20
## [73] 0.19 0.19 0.19 0.18 0.18 0.17 0.17 0.17 0.17 0.16 0.16 0.16
## [85] 0.16 0.15 0.15 0.15 0.15 0.15 0.14 0.14 0.13 0.13 0.13 0.13
## [97] 0.12 0.12 0.12 0.12 0.12 0.12 0.11 0.11 0.11 0.11 0.10 0.10
## [109] 0.10 0.10 0.10 0.10 0.09 0.09 0.09 0.09 0.09 0.08 0.08 0.08
## [121] 0.08 0.08 0.08 0.07 0.07 0.07 0.07 0.07 0.06 0.06 0.06 0.06
## [133] 0.06 0.06 0.06 0.05 0.05 0.05 0.05 0.05 0.04 0.04 0.04 0.04
## [145] 0.03 0.03 0.03 0.00
# 2. Merge PCA and Clinical Subset
# Convert PCA results to a data frame
pcx <- data.frame(pc$x)
# Add patient IDs from row names to the data frame
pcx$patientid <- rownames(pcx)
# Merge PCA data with patient IDs and institutions from clinical data
pcx_merge <- merge(pcx, clin[, c("patientid", "Institution")], by="patientid")
# Set row names to patient IDs
rownames(pcx_merge) <- pcx_merge[,1]
# Removeredundant patient ID column
pcx_merge <- pcx_merge[,-1]
The bar plot displays PC1 as the longest, explaining the most variance, while subsequent bars (PC2, PC3, etc.) become progressively shorter, indicating diminishing variance.
#To find the largest PC of data
barplot(var_res, main="Bar Plot", xlab="Principal Component", ylab="Percentage of Variance ", col="skyblue")
Reason :To visualize variance by each principal component.
Result :PCA on gene expression data, shows that the ‘Institution’ factor has no significant effect on gene expression data sampling. (no Batch effect has been detected)
# Create labels for the plot
xlab_text <- paste("PC1 (", var_res[1], "%)", sep = "")
ylab_text <- paste("PC2 (", var_res[2], "%)", sep = "")
# Plot PCA results
ggplot(pcx_merge, aes(PC1, PC2, color = Institution)) +
theme_bw() +
geom_point() +
labs(x = xlab_text, y = ylab_text)
Harmonized_Confirmed_BOR_Bin:
Harmonized_Confirmed_BOR_Bin from the clinical dataset,
aiming to distinguish responders (PR/CR) from nonresponders (SD/PD).
This was mentioned in paper method section RNA-seq Differential
Expressionresponse:
response classification obtained via the Get_Response
script.# Filter expression data based on 'Harmonized_Confirmed_BOR_Bin' column in 'clin'
bor_filtered <- clin$Harmonized_Confirmed_BOR_Bin[!is.na(clin$Harmonized_Confirmed_BOR_Bin)]
expr_bor <- expr[, !is.na(clin$Harmonized_Confirmed_BOR_Bin)] # Remove 12 rows with missing responses
# Create design matrix and fit linear model
design_bor <- model.matrix(~ bor_filtered)
fit_bor <- lmFit(expr_bor, design_bor)
# Perform eBayes analysis and display results
fit_bor <- eBayes(fit_bor)
datatable(topTable(fit_bor))
# Repeat process for 'response' column in 'clin'
response_filtered <- clin$response[!is.na(clin$response)]
expr_response <- expr[, !is.na(clin$response)] # Remove 27 rows with missing responses
# Create design matrix and fit linear model
design_response <- model.matrix(~ response_filtered)
fit_response <- lmFit(expr_response, design_response)
# Perform eBayes analysis and display results
fit_response <- eBayes(fit_response)
datatable(topTable(fit_response))
Preparing Data for volcano Plot
Convert fit, both fit_bor and
fit_response, to Data frame and add column
gene_name symbol
# Retrieve all results from the analysis and convert
volcano_data_res <- topTable(fit_bor, number=Inf)
volcano_data_bor <- topTable(fit_bor, number=Inf)
# Convert the result to a data frame
df_res <- as.data.frame(volcano_data_res)
df_bor <- as.data.frame(volcano_data_bor)
# Subset 'gene_id_no_ver' and 'gene_name' from the gene data
subset_annot <- annot_proteincoding[, c("gene_id_no_ver", "gene_name")]
# Add a 'gene_id' column to the volcano_data
volcano_data_res$gene_id_no_ver <- rownames(volcano_data_res)
volcano_data_bor$gene_id_no_ver <- rownames(volcano_data_bor)
# Merge 'volcano_data' and 'annot_proteincoding' by 'gene_id'
merge_result_res <- merge(volcano_data_res, subset_annot, by = "gene_id_no_ver")
merge_result_bor <- merge(volcano_data_bor, subset_annot, by = "gene_id_no_ver")
# Display the merged result in a table format
datatable(merge_result_res)
datatable(merge_result_bor)
Fig. 2 in the paper details a Volcano plot from limma voom analysis,
displaying two-sided nominal P values. Genes are identified as
significantly differentially expressed using cutoffs of
|log2(fold change)| > 0.5 and
P < 0.05.
# 1.Volcano Plot based on P value:
# For merge_result_res:
EnhancedVolcano(merge_result_res,
lab = merge_result_res$gene_name,
x = 'logFC',
y = 'P.Value',
pCutoff = 0.05,
FCcutoff = 0.5,
xlim = c(-2, 2),
ylim = c(0, -log10(10e-4)),
title= 'Volcano Plot based on P value (based on BOR)'
)
# For merge_result_res:
EnhancedVolcano(merge_result_bor,
lab = merge_result_bor$gene_name,
x = 'logFC',
y = 'P.Value',
pCutoff = 0.05,
FCcutoff = 0.5,
xlim = c(-2, 2),
ylim = c(0, -log10(10e-4)),
title= 'Volcano Plot based on P value (Based on response)'
)